package belloCollector.crawler;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

/**
 * Breadth-first crawler that walks Sogou WeChat article-list pages
 * (weixin.sogou.com/pcindex/...) and prints title, source, URL and summary
 * for each article entry found in {@code <li>} elements.
 */
public class SougouWechatArticleListCrawler extends BreadthCrawler {

	/**
	 * @param crawlPath directory used by the underlying Berkeley DB crawl store
	 * @param autoParse whether WebCollector should auto-extract links from pages
	 */
	public SougouWechatArticleListCrawler(String crawlPath, boolean autoParse) {
		super(crawlPath, autoParse);
	}

	/**
	 * Extracts article metadata from every {@code <li>} element of a fetched
	 * list page and prints one formatted record per article.
	 *
	 * @param page the fetched page (already parsed by jsoup)
	 * @param next collector for follow-up URLs (unused here; auto-parse handles links)
	 */
	@Override
	public void visit(Page page, CrawlDatums next) {
		System.out.println(page.url());
		// Walk each list item and pull out the article fields.
		Elements elements = page.select("li");
		for (Element element : elements) {
			// Hoist the invariant selection: the original queried
			// ".txt-box h3>a" twice (once for href, once for text).
			Elements titleLink = element.select(".txt-box h3>a");
			String url = titleLink.attr("abs:href");
			String title = titleLink.text();
			String txtInfo = element.select(".txt-box").text();
			String src = element.select(".s-p").text();
			String line = String.format("标题：%s,\n来源：%s,\n地址：%s,\n简介：%s", title, src, url, txtInfo);
			System.out.println(line + "\n--------------------------------");
		}
	}

	public static void main(String[] args) throws Exception {
		// Build the candidate seed-URL set.
		CrawlDatums datums = new CrawlDatums();
		// String url = "http://weixin.sogou.com/pcindex/pc/pc_0/pc_0.html";
		String url_0 = "http://weixin.sogou.com/pcindex/pc/pc_%s";
		for (int i = 0; i < 1; i++) { // category index; maximum is 19
			String url = String.format(url_0, i).concat("/%s.html");
			for (int m = 0; m < 16; m++) {
				// Page 0 of each category uses the "pc_<i>.html" name;
				// subsequent pages are plain "<m>.html".
				String seedUrl = (m == 0) ? String.format(url, "pc_" + i) : String.format(url, m);
				datums.add(new CrawlDatum(seedUrl));
			}
		}
		SougouWechatArticleListCrawler articleCrawler = new SougouWechatArticleListCrawler("搜狗文章", true);
		// NOTE(review): 16 seeds are built above but only the first is added —
		// presumably a deliberate restriction while testing; confirm whether
		// articleCrawler.addSeed(datums) (all seeds) was intended.
		articleCrawler.addSeed(datums.get(0));
		// articleCrawler.setTopN(2);
		articleCrawler.start(1);
	}
}
